---
title: "Howe, Philip J., Steinecke, David & Zuber, Christina Isabel (2024). Competing Principals in a Multinational State: Legislative Behavior in Imperial Austria, 1907-1914."
output: html_notebook
---

```{r setup}
library(tidyverse)
library(glue)
library(janitor)
library(readxl)
library(haven)
```

# General data preparation

## Load data

```{r}
# Roll-call votes, minus two columns that are not used further on.
rcv <- select(
  read_excel("data/roll_call_votes.xlsx"),
  -petitioner, -page_in_protokolle
)

# Representative tables for the two elections, each minus unused columns.
reps_1907 <- select(read_excel("data/reps_1907.xlsx"), -section_181)

reps_1911 <- select(
  read_excel("data/reps_1911.xlsx"),
  -section_21, -notes_from_adlgasser_coding_procedure, -district_id
)

# District lookup table; joined onto the district data by `district_name`.
district_id <- read_excel("data/district_id.xlsx")

# Club membership: keep a fixed set of columns / column ranges, in this order.
clubs <- select(
  read_excel("data/clubs.xlsx"),
  election:session_detailed, rep, rep_unique, district_id:district_name,
  district:replacement, hospitant, klub, klub_org, v_lawyer:v_chrisworker
)

# Party label tables, one worksheet per election year.
party_labels_1907 <- read_excel("data/party_labels.xlsx", sheet = "1907")

party_labels_1911 <- read_excel("data/party_labels.xlsx", sheet = "1911")
```


## Merge club membership and representative data

```{r}
# Attach the 1907 representative records to the club membership table.
# The join key `rep_fam_name` is derived from `rep_unique`.
reps_1907_clubs <- clubs %>%
  mutate(
    # Cut everything after the first token following the comma, drop any
    # parenthesised addition, and trim surrounding whitespace.
    rep_unique_ = trimws(
      str_remove(
        str_replace(rep_unique, "(,\\s*\\S+).*", "\\1"),
        "\\s*\\(.*\\)"
      )
    )
  ) %>%
  separate(
    rep_unique_,
    into = c("rep_fam_name", "rep_first_name"),
    sep = ",",
    remove = FALSE
  ) %>%
  left_join(
    select(reps_1907, -election, -klub, -starts_with("v_")),
    by = c("session", "session_detailed", "district", "district_name",
           "rep_fam_name")
  ) %>%
  mutate(
    # Recode the German crown-land names; any other value of `land`
    # (including NA) keeps the existing `province`.
    province = case_match(
      land,
      "Böhmen" ~ "Bohemia",
      "Bukowina" ~ "Bukovina",
      "Dalmatien" ~ "Dalmatia",
      "Galizien" ~ "Galicia",
      "Görz und Gradisca" ~ "G&G",
      "Istrien" ~ "Istria",
      "Kärnten" ~ "Carinthia",
      "Krain" ~ "Carniola",
      "Mähren deutsch" ~ "Moravia-German",
      "Mähren tschechisch" ~ "Moravia-Czech",
      "Niederösterreich" ~ "LA",
      "Oberösterreich" ~ "UA",
      "Schlesien" ~ "Silesia",
      "Steiermark" ~ "Styria",
      "Tirol" ~ "Tyrol",
      "Triest" ~ "Trieste",
      .default = province
    ),
    # Fall back to the club table's `rep` when no `name_adlg` was joined in.
    name_adlg = coalesce(name_adlg, rep)
  ) %>%
  select(
    election:session_detailed, district_name:district, name_adlg, province,
    klub, v_lawyer:v_chrisworker
  )
```


```{r}
# Stack both legislatures' representatives and derive a normalised
# "Family, First" name (`name_adlg_`) plus its two components.
reps <- bind_rows(reps_1907_clubs, reps_1911) %>%
  select(election:v_chrisworker, v_priest:v_teacher) %>%
  mutate(
    row = row_number(),
    # "." is a placeholder for a missing club.
    klub = na_if(klub, "."),
    # Transliterate to ASCII, keep only the first token after the comma,
    # drop parenthesised additions, trim whitespace.
    name_adlg_ = trimws(
      str_remove(
        str_replace(
          stringi::stri_trans_general(name_adlg, "Latin-ASCII"),
          "(,\\s*\\S+).*", "\\1"
        ),
        "\\s*\\(.*\\)"
      )
    )
  ) %>%
  mutate(
    # Manual spelling fixes for individual names; all other names pass through.
    # NOTE(review): presumably these align with the spellings in the roll-call
    # data, which is joined on `rep_fam_name` later — confirm.
    name_adlg_ = case_match(
      name_adlg_,
      "de Gentili, Guido" ~ "Gentili, Guido",
      "Guggenberg zu Riedhofen, Atanas" ~ "Guggenberg, Atanas",
      "Habermann," ~ "Habrman, Gusta",
      "Leys zu Paschpach, Emil" ~ "Leys, Emil",
      "Pad'our, Jindrich" ~ "Padour, Jindrich",
      "Hofmann v. Wellenhof, Paul" ~ "Hofmann von Wellenhof, Paul",
      "Sesardic, Ante" ~ "Sefardic, Ante",
      .default = name_adlg_
    )
  ) %>%
  separate(
    name_adlg_,
    into = c("rep_fam_name", "rep_first_name"),
    sep = ",",
    remove = FALSE
  )
```


## Merge roll call vote and representative data

```{r}
# Attach representative attributes to every roll-call vote record.
rcv_reps <- rcv %>%
  # Strip parenthesised additions from the family name used as join key.
  mutate(rep_fam_name = trimws(str_remove(rep_fam_name, "\\s*\\(.*\\)"))) %>%
  left_join(
    reps,
    by = c("session", "session_detailed", "district", "rep_fam_name")
  ) %>%
  mutate(
    # First one- or two-digit number found in the district name.
    dist_num = as.numeric(str_extract(district_name, "\\d{1,2}")),
    # For Galician districts numbered 35 and above, remove the first run of
    # one or two "I" characters from `district`; leave all others unchanged.
    district = case_when(
      province == "Galicia" & dist_num >= 35 ~ str_remove(district, "I{1,2}"),
      .default = district
    )
  ) %>%
  select(
    legislature, rcv, bill, topic, issue_area, issue, session,
    session_detailed, vote_date, rep_unique, rep_id, rep_vote_id, vote,
    province, district_name, district, elected_date, sworn_in_date,
    drop_out_date, replacement, hospitant, klub:v_teacher, dist_num,
    rep_fam_name, name_adlg
  ) %>%
  arrange(rep_id)
```


## Prepare district data for first term (1907-1911)

```{r}
# Round-1 results and district characteristics for 1907.
district_1907_r1 <- select(
  read_excel("data/district_data_1907.xlsx", sheet = "Round 1"),
  election, district_name, precincts, province, dm, pop,
  starts_with("gr"), largest_lang_group, urban, qualified, runoff,
  ballots, valid, max_votes_byparty, domparty_district_simple, socdem_nat
)

# Round-2 sheet: only the round-2 party label and social-democrat column.
district_1907_r2 <- select(
  read_excel("data/district_data_1907.xlsx", sheet = "Round 2"),
  district_name, domparty_district_simple2, socdem_nat2
)

# Galician 2-member districts live on their own sheet (fixed cell range).
# Column names are cleaned and the language-group columns get the same "gr"
# prefix as on the round-1 sheet.
district_1907_gal <- read_excel(
  "data/district_data_1907.xlsx",
  sheet = "Galcia, 2-Member Districts",
  range = "A2:S38"
) %>%
  clean_names() %>%
  rename(
    precincts = name,
    sercro = ser_cro,
    ruthen = ruthenian
  ) %>%
  rename_with(function(nm) paste0("gr", nm), croat:slovene) %>%
  mutate(
    district_name = glue("{province}{wahlbezirk_number}"),
    election = 1907,
    dm = 2
  ) %>%
  select(-wahlbezirk_number, -only_1st_2nd_narrow)


# Assemble the 1907 district table: stack the Galician 2-member districts onto
# the round-1 sheet, attach district ids and round-2 results, then derive vote
# shares, party labels and imputed population / language-group counts.
district_1907 <- district_1907_r1 %>% 
  bind_rows(district_1907_gal) %>%
  left_join(district_id, by = "district_name") %>% 
  left_join(district_1907_r2, by = "district_name") %>% 
  # Row order matters from here on: lead() and fill() below work on adjacent rows.
  arrange(district_id) %>% 
  mutate(max_votes_byparty = case_when(district_name == "Silesia9" ~ 2208, # correction due to sum of votes of 2 candidates
                                       district_name == "Bukovina14" ~ 4340, # correction due to sum of votes of 2 candidates 
                                       TRUE ~ max_votes_byparty),
         max_vote_share = max_votes_byparty / valid,
         # Where runoff == 1 take the round-2 values, otherwise the round-1 values.
         partylabel = if_else(runoff == 1, domparty_district_simple2, domparty_district_simple),
         # NOTE(review): socdem_nat is not among the columns kept by the final select().
         socdem_nat = if_else(runoff == 1, socdem_nat2, socdem_nat)) %>% 
  left_join(party_labels_1907, by = "partylabel") %>%
  # pop == 0 is treated as "not reported": blank it out, and flag both that row
  # and the row directly before it (imp == 1) so they are filled together.
  mutate(pop_imp = case_when(pop == 0 ~ NA_real_, 
                             TRUE ~ pop),
         imp = case_when(pop == 0 ~ 1,
                         lead(pop) == 0 ~ 1,
                         TRUE ~ 0),
         # City key: normalise two comma variants, then cut at the first comma.
         city = precincts %>% str_replace("Wien,", "Wien") %>% 
           str_replace("XVI,", "XVI.") %>% 
           str_remove_all("(?=\\,).*")) %>% 
  group_by(city, imp) %>% 
  fill(pop_imp, .direction = "down") %>% # impute population for district where only city data available 
  ungroup() %>% 
  # In the listed provinces the (imputed) population is used as the German
  # count; in Moravia-Czech it is used as the Czech count.
  mutate(grgerman = case_when(province == "LA" | 
                                province == "Salzburg" |
                                province == "UA" |
                                province == "Vorarlberg"|
                                province == "Moravia-German" ~ pop_imp, 
                              TRUE ~ grgerman),
         grczech = case_when(province == "Moravia-Czech" ~ pop_imp, 
                             TRUE ~ grczech),
         # NOTE(review): the imp_gr* flags below are not used by any later step
         # of this pipeline and are dropped by the final select() (their names
         # do not end in "_imp").
         imp_grczech = case_when(grczech == 0 ~ 1,
                                 lead(grczech) == 0 ~ 1,
                                 TRUE ~ 0),
         imp_grgerman = case_when(grgerman == 0 ~ 1,
                                  lead(grgerman) == 0 ~ 1,
                                  TRUE ~ 0),
         imp_gritalian = case_when(gritalian == 0 ~ 1,
                                   lead(gritalian) == 0 ~ 1,
                                   TRUE ~ 0),
         imp_grpolish = case_when(grpolish == 0 ~ 1,
                                  lead(grpolish) == 0 ~ 1,
                                  TRUE ~ 0),
         imp_grruthen = case_when(grruthen == 0 ~ 1,
                                  lead(grruthen) == 0 ~ 1,
                                  TRUE ~ 0),
         imp_grromanian = case_when(grromanian == 0 ~ 1,
                                    lead(grromanian) == 0 ~ 1,
                                    TRUE ~ 0)) %>%
  # Swap 0 and NA: a recorded 0 becomes NA (to be filled from the row above
  # within the same city), a missing value becomes 0.
  mutate(grczech_imp = grczech %>% 
           case_match(., 0 ~ NA, NA ~ 0, .default = .),
         grgerman_imp = grgerman %>% 
           case_match(., 0 ~ NA, NA ~ 0, .default = .),
         gritalian_imp = gritalian %>% 
           case_match(., 0 ~ NA, NA ~ 0, .default = .),
         grpolish_imp = grpolish %>% 
           case_match(., 0 ~ NA, NA ~ 0, .default = .),
         grruthen_imp = grruthen %>% 
           case_match(., 0 ~ NA, NA ~ 0, .default = .),
         grromanian_imp = grromanian %>% 
           case_match(., 0 ~ NA, NA ~ 0, .default = .)) %>% 
  # Unlike pop_imp, the language counts are filled per city only (not per
  # city-and-flag); an NA in the first row of a city stays NA.
  group_by(city) %>% 
  fill(grczech_imp:grromanian_imp, .direction = "down") %>% 
  ungroup() %>%
  # The remaining language groups are not filled; missing counts become 0.
  mutate(grcroat_imp = if_else(is.na(grcroat), 0, grcroat), 
         grserb_imp= if_else(is.na(grserb), 0, grserb), 
         grsercro_imp = if_else(is.na(grsercro), 0, grsercro), 
         grslovene_imp = if_else(is.na(grslovene), 0, grslovene)) %>% 
  # ends_with("_imp") keeps pop_imp and the ten gr*_imp columns.
  # NOTE(review): `ideology` presumably comes from party_labels_1907 — confirm.
  select(election, district_id, district_name, district_adlg, precincts, province, 
         dm, largest_lang_group, urban, qualified, runoff, ballots, valid, 
         max_votes_byparty, max_vote_share, partylabel, 
         ideology, ends_with("_imp"))
```


## Prepare district data for second term (1911-1914)

```{r}
# Round-1 results and district characteristics for 1911.
district_1911_r1 <- select(
  read_excel("data/district_data_1911.xlsx", sheet = "Round 1"),
  election, district_name, precincts, province, dm, pop,
  starts_with("gr"), largest_lang_group, urban, qualified, runoff,
  ballots, valid, max_votes_byparty, domparty_district_simple, socdem_nat
)

# Round-2 sheet: only the round-2 party label and social-democrat column.
district_1911_r2 <- select(
  read_excel("data/district_data_1911.xlsx", sheet = "Round 2"),
  district_name, domparty_district_simple2, socdem_nat2
)


# Galician 2-member districts live on their own sheet (fixed cell range).
# Column names are cleaned and the language-group columns get the same "gr"
# prefix as on the round-1 sheet.
district_1911_gal <- read_excel(
  "data/district_data_1911.xlsx",
  sheet = "Galcia, 2-Member Districts",
  range = "A2:S38"
) %>%
  clean_names() %>%
  rename(
    precincts = name,
    sercro = ser_cro,
    ruthen = ruthenian
  ) %>%
  rename_with(function(nm) paste0("gr", nm), croat:slovene) %>%
  mutate(
    district_name = glue("{province}{wahlbezirk_number}"),
    election = 1911,
    dm = 2
  ) %>%
  select(-wahlbezirk_number, -only_1st_2nd_narrow)


# Assemble the 1911 district table. Same steps as for 1907, plus a fallback
# for missing population figures using language counts or the 1907 values.
district_1911 <- district_1911_r1 %>% 
  bind_rows(district_1911_gal) %>% 
  left_join(district_id, by = "district_name") %>% 
  left_join(district_1911_r2, by = "district_name") %>% 
  # Bring in the 1907 round-1 figures as pop_07 / qualified_07.
  left_join(district_1907_r1 %>% select(district_name, pop, qualified), 
            by = "district_name", suffix = c("", "_07")) %>%
  # Row order matters from here on: lead() and fill() below work on adjacent rows.
  arrange(district_id) %>% 
  mutate(max_vote_share = max_votes_byparty / valid,
         # Where runoff == 1 take the round-2 values, otherwise the round-1 values.
         partylabel = if_else(runoff == 1, domparty_district_simple2, domparty_district_simple),
         # NOTE(review): socdem_nat is not among the columns kept by the final select().
         socdem_nat = if_else(runoff == 1, socdem_nat2, socdem_nat),
         # Missing population: in Tyrol use German + Italian counts, elsewhere
         # the 1907 population; the result is rounded down.
         pop = case_when(is.na(pop) & province == "Tyrol" ~ grgerman + gritalian,
                         is.na(pop) & province != "Tyrol" ~ pop_07,
                         TRUE ~ pop) %>% floor()) %>% 
  left_join(party_labels_1911, by = "partylabel") %>%
  # pop == 0 is treated as "not reported": blank it out, and flag both that row
  # and the row directly before it (imp == 1) so they are filled together.
  mutate(pop_imp = case_when(pop == 0 ~ NA_real_, 
                             TRUE ~ pop),
         imp = case_when(pop == 0 ~ 1,
                         lead(pop) == 0 ~ 1,
                         TRUE ~ 0),
         # City key: normalise two comma variants, then cut at the first comma.
         city = precincts %>% str_replace("Wien,", "Wien") %>% 
           str_replace("XVI,", "XVI.") %>% 
           str_remove_all("(?=\\,).*")) %>% 
  group_by(city, imp) %>% 
  fill(pop_imp, .direction = "down") %>% # impute population for district where only city data available 
  ungroup() %>% 
  # In the listed provinces the (imputed) population is used as the German
  # count; in Moravia-Czech it is used as the Czech count.
  mutate(grgerman = case_when(province == "LA" | 
                                province == "Salzburg" |
                                province == "UA" |
                                province == "Vorarlberg"|
                                province == "Moravia-German" ~ pop_imp, 
                              TRUE ~ grgerman),
         grczech = case_when(province == "Moravia-Czech" ~ pop_imp, 
                             TRUE ~ grczech),
         # NOTE(review): the imp_gr* flags below are not used by any later step
         # of this pipeline and are dropped by the final select() (their names
         # do not end in "_imp").
         imp_grczech = case_when(grczech == 0 ~ 1,
                                 lead(grczech) == 0 ~ 1,
                                 TRUE ~ 0),
         imp_grgerman = case_when(grgerman == 0 ~ 1,
                                  lead(grgerman) == 0 ~ 1,
                                  TRUE ~ 0),
         imp_gritalian = case_when(gritalian == 0 ~ 1,
                                   lead(gritalian) == 0 ~ 1,
                                   TRUE ~ 0),
         imp_grpolish = case_when(grpolish == 0 ~ 1,
                                  lead(grpolish) == 0 ~ 1,
                                  TRUE ~ 0),
         imp_grruthen = case_when(grruthen == 0 ~ 1,
                                  lead(grruthen) == 0 ~ 1,
                                  TRUE ~ 0),
         imp_grromanian = case_when(grromanian == 0 ~ 1,
                                    lead(grromanian) == 0 ~ 1,
                                    TRUE ~ 0)) %>%
  # Swap 0 and NA: a recorded 0 becomes NA (to be filled from the row above
  # within the same city), a missing value becomes 0.
  mutate(grczech_imp = grczech %>% 
           case_match(., 0 ~ NA, NA ~ 0, .default = .),
         grgerman_imp = grgerman %>% 
           case_match(., 0 ~ NA, NA ~ 0, .default = .),
         gritalian_imp = gritalian %>% 
           case_match(., 0 ~ NA, NA ~ 0, .default = .),
         grpolish_imp = grpolish %>% 
           case_match(., 0 ~ NA, NA ~ 0, .default = .),
         grruthen_imp = grruthen %>% 
           case_match(., 0 ~ NA, NA ~ 0, .default = .),
         grromanian_imp = grromanian %>% 
           case_match(., 0 ~ NA, NA ~ 0, .default = .)) %>% 
  # Unlike pop_imp, the language counts are filled per city only (not per
  # city-and-flag); an NA in the first row of a city stays NA.
  group_by(city) %>% 
  fill(grczech_imp:grromanian_imp, .direction = "down") %>% 
  ungroup() %>%
  # The remaining language groups are not filled; missing counts become 0.
  mutate(grcroat_imp = if_else(is.na(grcroat), 0, grcroat), 
         grserb_imp= if_else(is.na(grserb), 0, grserb), 
         grsercro_imp = if_else(is.na(grsercro), 0, grsercro), 
         grslovene_imp = if_else(is.na(grslovene), 0, grslovene)) %>% 
  # ends_with("_imp") keeps pop_imp and the ten gr*_imp columns.
  # NOTE(review): `ideology` presumably comes from party_labels_1911 — confirm.
  select(election, district_id, district_name, district_adlg, precincts, province, 
         dm, largest_lang_group, urban, qualified, runoff, ballots, valid, 
         max_votes_byparty, max_vote_share, partylabel,
         ideology, ends_with("_imp"))
```


## Prepare voters occupational data

```{r}
# Voter counts by occupational sector per district.
#
# Only rows with status "z." are kept (NOTE(review): meaning of "z." is not
# visible here — confirm against the codebook). The count columns are parsed
# as numbers after stripping "." characters, then summed into three sectors.
# Missing counts are treated as zero in the sums.
occ <- read_excel("data/voters_occupation.xlsx") %>%
  filter(status == "z.") %>%
  # Remove every literal "." (not just the first) so values with more than
  # one thousands separator, e.g. "1.234.567", parse to the intended number.
  mutate(across(
    LaFo_s:OefFreOhn,
    ~ as.numeric(str_remove_all(.x, fixed(".")))
  )) %>%
  # Vectorised row sums; unname() keeps the new columns free of row names.
  mutate(
    industry = unname(rowSums(
      pick(InGeGro_s, InGeGro_b, InGeGro_a, InGeKle_s,
           InGeKle_b, InGeKle_a, HaVerPos_b, HaVerPos_k,
           HaVerPos_a, HaVerEis_b, HaVerEis_k, HaVerEis_a,
           HaVerSon_s, HaVerSon_b, HaVerSon_a),
      na.rm = TRUE
    )),
    agriculture = unname(rowSums(
      pick(LaFo_s, LaFo_b, LaFo_a, LaFo_h),
      na.rm = TRUE
    )),
    public = unname(rowSums(
      pick(OefFreHof_b, OefFreHof_k, OefFreHof_a,
           OefFreAnd_b, OefFreAnd_l, OefFreAnd_g,
           OefFreAnd_k, OefFreAnd_a),
      na.rm = TRUE
    ))
  ) %>%
  select(district_name, industry, agriculture, public)
```


## Calculate economic sector sizes

```{r}
# Sector sizes as shares of qualified voters, per district and legislature.
# The same occupational counts (`occ`) are joined onto both district tables.
district_1907_occ <- left_join(district_1907, occ, by = "district_name") %>%
  mutate(
    across(
      c(industry, agriculture, public),
      ~ .x / qualified,
      .names = "p_{.col}"
    ),
    legislature = 1907
  )

district_1911_occ <- left_join(district_1911, occ, by = "district_name") %>%
  mutate(
    across(
      c(industry, agriculture, public),
      ~ .x / qualified,
      .names = "p_{.col}"
    ),
    legislature = 1911
  )

# Stack both legislatures, keep the analysis columns, drop duplicate rows.
district_occ <- bind_rows(district_1907_occ, district_1911_occ) %>%
  select(
    district_id, district_name, precincts, dm, urban:grslovene_imp,
    starts_with("p_"), legislature
  ) %>%
  unique()
```


## Merge all data into one dataset

```{r}
# Combine roll-call votes with district characteristics and derive the
# grouping variables (region, largest language group, dominant sector,
# club and party-label variants) used by the later analyses.
data_full <- rcv_reps %>%
  left_join(district_occ, by = c("legislature", "district_name")) %>%
  filter(
    vote != "no_rep",
    dm != 2 # drop 2-member districts from Galicia
  ) %>%
  mutate(
    # Both Moravian electoral bodies belong to the same region.
    region = if_else(
      province %in% c("Moravia-Czech", "Moravia-German"),
      "Moravia",
      province
    ),
    # Largest language group; on ties the first match in the list below wins.
    max_lang_gr = pmax(
      grczech_imp, grgerman_imp, gritalian_imp, grpolish_imp, grruthen_imp,
      grromanian_imp, grcroat_imp, grserb_imp, grsercro_imp, grslovene_imp
    ),
    largest_lang_gr = case_when(
      max_lang_gr == grczech_imp ~ "cze",
      max_lang_gr == grgerman_imp ~ "ger",
      max_lang_gr == gritalian_imp ~ "ita",
      max_lang_gr == grpolish_imp ~ "pol",
      max_lang_gr == grruthen_imp ~ "rut",
      max_lang_gr == grromanian_imp ~ "rom",
      max_lang_gr == grcroat_imp ~ "cro",
      max_lang_gr == grserb_imp ~ "ser",
      max_lang_gr == grsercro_imp ~ "sercro",
      max_lang_gr == grslovene_imp ~ "slo"
    ),
    # Dominant economic sector, same first-match rule on ties.
    max_dom_sec = pmax(p_industry, p_agriculture, p_public),
    dom_sec = case_when(
      max_dom_sec == p_industry ~ "industry",
      max_dom_sec == p_agriculture ~ "agriculture",
      max_dom_sec == p_public ~ "public"
    ),
    # 1 when the largest vote share is at most one third; 0 otherwise
    # (including when the share is missing).
    comp_dist = case_when(max_vote_share <= 1 / 3 ~ 1, .default = 0),
    ideology_unclear = coalesce(ideology, "unclear"),
    # "Minimal" club coding: every club-less representative is their own group.
    klub_none_min = case_when(
      klub == "none" ~ glue("none_{rep_id}"),
      .default = klub
    ),
    klub_no_ind_min = case_when(
      str_detect(klub_none_min, "none") ~ NA_character_,
      .default = klub_none_min
    ),
    # "Maximal" club coding: clubs are merged into larger groupings, some of
    # them only within given ranges of `session_detailed`.
    klub_none_max = case_when(
      klub == "none" ~ glue("none_{rep_id}"),

      klub %in% c("bohsoc", "gersoc", "posoc", "itasoc", "ruksoc") ~ "socdem",

      klub %in% c("geragr", "gerlab", "gernat", "gerpro", "gerrad",
                  "geryou") ~ "gernatpan",

      # "czenatsocradpro" would belong here from session 203 on (not in data).
      session_detailed >= 203 &
        klub %in% c("catnat", "cze", "czeagr") ~ "bohun",
      session_detailed >= 211 & session_detailed <= 212 &
        klub %in% c("czenatsoc", "czein") ~ "bohun",

      session_detailed >= 185 & klub %in% c("slo", "nat") ~ "narsve",
      session_detailed >= 185 & session_detailed <= 203 &
        klub == "sousla" ~ "narsve",

      # "ukkl" and "ukrad" would map to "uk" from session 211 on
      # (only in Galicia).
      session_detailed >= 211 & klub == "rutbuk" ~ "uk",

      session_detailed >= 211 &
        klub %in% c("italib", "itapeo", "rom") ~ "unilat",

      .default = klub
    ),
    klub_no_ind_max = case_when(
      str_detect(klub_none_max, "none") ~ NA_character_,
      .default = klub_none_max
    ),
    # Electoral party label: independents become one-person groups.
    partylabel_none = case_when(
      partylabel %in% c("ind", "noparty") ~ glue("ind_{rep_id}"),
      partylabel == "indsoc" ~ glue("indsoc_{rep_id}"),
      .default = partylabel
    ),
    partylabel_no_ind = case_when(
      str_detect(partylabel_none, "ind") ~ NA_character_,
      .default = partylabel_none
    ),
    check = NA
  ) %>%
  # Concatenate all association ("v_") indicators into a single key column.
  unite("assoc_prof", starts_with("v_"), remove = FALSE) %>%
  select(-max_lang_gr, -max_dom_sec)

save(data_full, file = "data_full.RData")
#load("data_full.RData")
```



# Data preparation for the Dyadic analysis

## Prepare dyadic dataset by session volume 

```{r}
# One row per representative and detailed session: drop the vote-level
# columns, de-duplicate, add a rep-by-session key, factorise character columns.
session_data <- data_full %>%
  select(
    -rcv, -bill, -topic, -issue, -issue_area, -vote_date, -rep_vote_id,
    -vote, -check
  ) %>%
  distinct() %>%
  mutate(
    rep_session_id = glue("{rep_id}_{session_detailed}"),
    across(where(is.character), as.factor)
  )

# Create collapsed list of rep_ids per detailed session
session_data_collapsed <- group_by(session_data, session_detailed) %>%
  summarise(rep_id_2 = list(rep_id), .groups = "drop")

# 1 when both members of a dyad share the value, 0 when they differ,
# NA when the comparison is NA.
flag_same <- function(a, b) {
  ifelse(a == b, 1, 0)
}

# Pair every representative with every other representative of the same
# detailed session, keep each unordered pair once, and compare attributes.
session_data_match <- session_data %>%
  left_join(session_data_collapsed, by = "session_detailed") %>%
  # All other rep_ids of the session (the representative itself removed).
  mutate(rep_id_2 = map2(rep_id_2, rep_id, setdiff)) %>%
  unnest(rep_id_2) %>%
  mutate(
    rep_min = pmin(rep_id, rep_id_2),
    rep_max = pmax(rep_id, rep_id_2),
    # Order-independent dyad key: smaller id first.
    dyad_id = as.factor(str_c(rep_min, rep_max, sep = "_")),
    dyad_session_id = as.factor(glue("{dyad_id}_{session_detailed}")),
    rep_2_session_id = as.factor(glue("{rep_id_2}_{session_detailed}"))
  ) %>%
  # each observation twice or only once
  distinct(dyad_session_id, .keep_all = TRUE) %>%
  select(-rep_min, -rep_max) %>%
  # Attach the second representative's attributes (suffix "_rep2").
  left_join(
    select(session_data, -legislature, -session, -session_detailed),
    by = c("rep_2_session_id" = "rep_session_id"),
    suffix = c("_rep1", "_rep2")
  ) %>%
  mutate(
    klub_min_same = flag_same(klub_none_min_rep1, klub_none_min_rep2),
    klub_max_same = flag_same(klub_none_max_rep1, klub_none_max_rep2),
    klub_no_ind_min_same = flag_same(klub_no_ind_min_rep1,
                                     klub_no_ind_min_rep2),
    klub_no_ind_max_same = flag_same(klub_no_ind_max_rep1,
                                     klub_no_ind_max_rep2),
    partylabel_same = flag_same(partylabel_none_rep1, partylabel_none_rep2),
    partylabel_no_ind_same = flag_same(partylabel_no_ind_rep1,
                                       partylabel_no_ind_rep2),
    ideology_same = flag_same(ideology_rep1, ideology_rep2),
    ideology_unclear_same = flag_same(ideology_unclear_rep1,
                                      ideology_unclear_rep2),
    region_same = flag_same(region_rep1, region_rep2),
    province_same = flag_same(province_rep1, province_rep2),
    district_same = flag_same(district_name_rep1, district_name_rep2),
    largest_lang_gr_same = flag_same(largest_lang_gr_rep1,
                                     largest_lang_gr_rep2),
    urban_rural_same = flag_same(urban_rep1, urban_rep2),
    # Three-way urban/rural classification; NA when no rule applies.
    urban_rural = as.factor(case_when(
      urban_rep1 == 1 & urban_rep2 == 1 ~ "both urban",
      urban_rep1 == 0 & urban_rep2 == 0 ~ "both rural",
      (urban_rep1 == 1 & urban_rep2 == 0) |
        (urban_rep1 == 0 & urban_rep2 == 1) ~ "different",
      .default = NA_character_
    )),
    # 1 when both representatives are flagged for at least one common
    # association; 0 otherwise.
    verein_same = case_when(
      v_lawyer_rep1 == 1 & v_lawyer_rep2 == 1 ~ 1,
      v_agrarian_rep1 == 1 & v_agrarian_rep2 == 1 ~ 1,
      v_commerce_rep1 == 1 & v_commerce_rep2 == 1 ~ 1,
      v_industry_rep1 == 1 & v_industry_rep2 == 1 ~ 1,
      v_resort_rep1 == 1 & v_resort_rep2 == 1 ~ 1,
      v_gerworker_rep1 == 1 & v_gerworker_rep2 == 1 ~ 1,
      v_judge_rep1 == 1 & v_judge_rep2 == 1 ~ 1,
      v_doctor_rep1 == 1 & v_doctor_rep2 == 1 ~ 1,
      v_civser_rep1 == 1 & v_civser_rep2 == 1 ~ 1,
      v_technician_rep1 == 1 & v_technician_rep2 == 1 ~ 1,
      v_chrisworker_rep1 == 1 & v_chrisworker_rep2 == 1 ~ 1,
      v_markets_rep1 == 1 & v_markets_rep2 == 1 ~ 1,
      v_priest_rep1 == 1 & v_priest_rep2 == 1 ~ 1,
      v_teachermid_rep1 == 1 & v_teachermid_rep2 == 1 ~ 1,
      v_monument_rep1 == 1 & v_monument_rep2 == 1 ~ 1,
      v_prof_rep1 == 1 & v_prof_rep2 == 1 ~ 1,
      v_teacher_rep1 == 1 & v_teacher_rep2 == 1 ~ 1,
      .default = 0
    ),
    dom_sec_same = flag_same(dom_sec_rep1, dom_sec_rep2),
    dom_sec_ai_same = as.factor(case_when(
      dom_sec_rep1 == "agriculture" &
        dom_sec_rep2 == "agriculture" ~ "both agriculture",
      dom_sec_rep1 == "industry" &
        dom_sec_rep2 == "industry" ~ "both industry",
      dom_sec_rep1 != dom_sec_rep2 ~ "different dominant sector",
      .default = NA_character_
    )),
    comp_dist_33_same = flag_same(comp_dist_rep1, comp_dist_rep2),
    comp_dist_50_same = flag_same(runoff_rep1, runoff_rep2)
  ) %>%
  select(
    legislature, session, session_detailed, rep_id_rep1, rep_session_id,
    rep_id_2, rep_2_session_id, dyad_id, dyad_session_id, urban_rural,
    ends_with("_same")
  )

save(session_data_match, file = "session_data_match.RData")
#load("session_data_match.RData")
```


## Prepare roll call vote data for dyadic dataset

```{r}
# Vote-level records reduced to the identifiers needed for pairing.
# Character columns are turned into factors before the selection.
vote_data <- data_full %>%
  mutate(
    rep_session_id = glue("{rep_id}_{session_detailed}"),
    across(where(is.character), as.factor)
  ) %>%
  select(legislature, session, session_detailed, rcv, rep_id, vote,
         rep_vote_id)

# Create collapsed list of rep_ids per rcv
vote_data_collapsed <- group_by(vote_data, rcv) %>%
  summarise(rep_id_2 = list(rep_id), .groups = "drop")

# Pair every representative with every other representative recorded for the
# same roll-call vote, keep each unordered pair once, and compare their votes.
vote_data_match <- vote_data %>%
  left_join(vote_data_collapsed, by = "rcv") %>%
  # All other rep_ids of the roll call (the representative itself removed).
  mutate(rep_id_2 = map2(rep_id_2, rep_id, setdiff)) %>%
  unnest(rep_id_2) %>%
  mutate(
    rep_min = pmin(rep_id, rep_id_2),
    rep_max = pmax(rep_id, rep_id_2),
    # Order-independent dyad key: smaller id first.
    dyad_id = as.factor(str_c(rep_min, rep_max, sep = "_")),
    dyad_vote_id = as.factor(glue("{dyad_id}_{rcv}")),
    dyad_session_id = as.factor(glue("{dyad_id}_{session_detailed}")),
    # NOTE(review): assumes `rep_vote_id` has the form "<rep_id>_<rcv>" so
    # that this key matches in the join below — confirm.
    rep_2_vote_id = as.factor(glue("{rep_id_2}_{rcv}"))
  ) %>%
  # each observation twice or only once
  distinct(dyad_vote_id, .keep_all = TRUE) %>%
  select(-rep_min, -rep_max) %>%
  # Attach the second representative's vote (suffix "_rep2").
  left_join(
    select(vote_data, rep_vote_id, vote),
    by = c("rep_2_vote_id" = "rep_vote_id"),
    suffix = c("_rep1", "_rep2")
  ) %>%
  mutate(
    # Agreement including abstentions/absences.
    vote_same = case_when(vote_rep1 == vote_rep2 ~ 1, .default = 0),
    # Agreement on yes/no votes only: NA as soon as either side
    # abstained or was absent.
    vote_same_yn = case_when(
      vote_rep1 == vote_rep2 & vote_rep1 != "abstain_absent" ~ 1,
      vote_rep1 == "abstain_absent" |
        vote_rep2 == "abstain_absent" ~ NA_real_,
      .default = 0
    )
  )

save(vote_data_match, file = "vote_data_match.RData")
#load("vote_data_match.RData")
```


## Create dyadic dataset and include only yes-no votes

```{r}
# Per dyad and detailed session: number of roll calls where both members voted
# yes or no, how often they agreed, and the resulting agreement share.
vote_data_aggr_yn <- vote_data_match %>%
  filter(!is.na(vote_same_yn)) %>%
  select(
    legislature, session, session_detailed, rep_id, rep_id_2, dyad_id,
    dyad_session_id, vote_same_yn
  ) %>%
  mutate(
    sum_vote_occasion = n(),
    sum_vote_same = sum(vote_same_yn),
    share_vote_same = sum_vote_same / sum_vote_occasion,
    .by = c(session_detailed, dyad_id)
  ) %>%
  # Keep the first row of every dyad-session combination.
  distinct(session_detailed, dyad_id, dyad_session_id, .keep_all = TRUE) %>%
  select(-vote_same_yn)


# Add the dyad-level covariates computed per session.
dyadic_analysis_session_yn <- left_join(
  vote_data_aggr_yn,
  select(session_data_match, dyad_session_id, urban_rural,
         ends_with("_same")),
  by = "dyad_session_id"
)

save(dyadic_analysis_session_yn, file = "dyadic_analysis_session_yn.RData")
#load("dyadic_analysis_session_yn.RData")
```



# Data preparation for the Deviation analysis

## Calculate group votes

```{r}
# Vote tallies per roll call and club (klub_no_ind_max), one row per
# legislature x rcv x club, with the club's majority position and cohesion.
# Rows without a club assignment are dropped before counting.
vote_by_group_max <- data_full %>% 
  filter(!is.na(klub_no_ind_max)) %>% 
  count(legislature, rcv, klub_no_ind_max, vote) %>% 
  # Give every club x roll call an explicit (possibly zero) count per vote option
  complete(nesting(legislature, rcv, klub_no_ind_max), vote, fill = list(n = 0)) %>% 
  pivot_wider(names_from = vote, values_from = n) %>% 
  janitor::clean_names() %>% 
  # Majority among the yea/nae votes only; ties are labelled "not clear"
  mutate(max_vote = pmax(nae, yea),
         max_vote_chr = case_when(yea > nae ~ "yea",
                                  nae > yea ~ "nae",
                                  TRUE ~ "not clear"),
         sum_vote = nae + yea) %>% 
  # Plurality when abstain_absent is treated as a third option
  mutate(max_pot_vote = pmax(nae, abstain_absent, yea),
         max_pot_vote_chr = case_when(yea > pmax(nae, abstain_absent) ~ "yea",
                                      nae > pmax(yea, abstain_absent) ~ "nae",
                                      abstain_absent > pmax(nae, yea) ~ "abstain_absent",
                                      TRUE ~ "not clear"),
         sum_pot_vote = nae + abstain_absent + yea) %>% 
  # Cohesion: size of the largest bloc minus half of everybody else, as a share
  mutate(cohesion = (max_vote - (0.5 * (sum_vote - max_vote))) / sum_vote,
         cohesion_wabsent = (max_pot_vote - (0.5 * (sum_pot_vote - max_pot_vote))) / sum_pot_vote,
         pg_pos = max_pot_vote / sum_pot_vote)

save(vote_by_group_max, file = "vote_by_group_max.RData")
#load("vote_by_group_max.RData")

# Vote tallies per roll call and club (klub_no_ind_min), one row per
# legislature x rcv x club, with the club's majority position and cohesion.
# Rows without a club assignment are dropped before counting.
vote_by_group_min <- data_full %>% 
  filter(!is.na(klub_no_ind_min)) %>% 
  count(legislature, rcv, klub_no_ind_min, vote) %>% 
  # Give every club x roll call an explicit (possibly zero) count per vote option
  complete(nesting(legislature, rcv, klub_no_ind_min), vote, fill = list(n = 0)) %>% 
  pivot_wider(names_from = vote, values_from = n) %>% 
  janitor::clean_names() %>% 
  # Majority among the yea/nae votes only; ties are labelled "not clear"
  mutate(max_vote = pmax(nae, yea),
         max_vote_chr = case_when(yea > nae ~ "yea",
                                  nae > yea ~ "nae",
                                  TRUE ~ "not clear"),
         sum_vote = nae + yea) %>% 
  # Plurality when abstain_absent is treated as a third option
  mutate(max_pot_vote = pmax(nae, abstain_absent, yea),
         max_pot_vote_chr = case_when(yea > pmax(nae, abstain_absent) ~ "yea",
                                      nae > pmax(yea, abstain_absent) ~ "nae",
                                      abstain_absent > pmax(nae, yea) ~ "abstain_absent",
                                      TRUE ~ "not clear"),
         sum_pot_vote = nae + abstain_absent + yea) %>% 
  # Cohesion: size of the largest bloc minus half of everybody else, as a share
  mutate(cohesion = (max_vote - (0.5 * (sum_vote - max_vote))) / sum_vote,
         cohesion_wabsent = (max_pot_vote - (0.5 * (sum_pot_vote - max_pot_vote))) / sum_pot_vote,
         pg_pos = max_pot_vote / sum_pot_vote)

save(vote_by_group_min, file = "vote_by_group_min.RData")
#load("vote_by_group_min.RData")
```


## Calculate electoral party votes

```{r}
# Vote tallies per roll call and electoral party label (partylabel_no_ind),
# one row per legislature x rcv x party, with the party's majority position
# and cohesion. Rows without a party label are dropped before counting.
vote_by_party <- data_full %>% 
  filter(!is.na(partylabel_no_ind)) %>% 
  count(legislature, rcv, partylabel_no_ind, vote) %>% 
  # Give every party x roll call an explicit (possibly zero) count per vote option
  complete(nesting(legislature, rcv, partylabel_no_ind), vote, fill = list(n = 0)) %>% 
  pivot_wider(names_from = vote, values_from = n) %>% 
  janitor::clean_names() %>% 
  # Majority among the yea/nae votes only; ties are labelled "not clear"
  mutate(max_vote = pmax(nae, yea),
         max_vote_chr = case_when(yea > nae ~ "yea",
                                  nae > yea ~ "nae",
                                  TRUE ~ "not clear"),
         sum_vote = nae + yea) %>% 
  # Plurality when abstain_absent is treated as a third option
  mutate(max_pot_vote = pmax(nae, abstain_absent, yea),
         max_pot_vote_chr = case_when(yea > pmax(nae, abstain_absent) ~ "yea",
                                      nae > pmax(yea, abstain_absent) ~ "nae",
                                      abstain_absent > pmax(nae, yea) ~ "abstain_absent",
                                      TRUE ~ "not clear"),
         sum_pot_vote = nae + abstain_absent + yea) %>% 
  # Cohesion: size of the largest bloc minus half of everybody else, as a share
  mutate(cohesion = (max_vote - (0.5 * (sum_vote - max_vote))) / sum_vote,
         cohesion_wabsent = (max_pot_vote - (0.5 * (sum_pot_vote - max_pot_vote))) / sum_pot_vote,
         party_pos = max_pot_vote / sum_pot_vote)

save(vote_by_party, file = "vote_by_party.RData")
#load("vote_by_party.RData")
```


## Calculate dominant group ideology

```{r}
# Count members per ideology category (ideology_unclear) within each club
# (klub_no_ind_max) on every roll call and label the club's dominant ideology.
# A category only wins if it is strictly larger than every other one;
# otherwise the label is "not clear".
ideology_by_group_max <- data_full %>% 
  filter(!is.na(klub_no_ind_max)) %>% 
  count(legislature, rcv, klub_no_ind_max, ideology_unclear) %>% 
  complete(nesting(legislature, rcv, klub_no_ind_max), ideology_unclear, fill = list(n = 0)) %>% 
  pivot_wider(names_from = ideology_unclear, values_from = n) %>% 
  janitor::clean_names() %>% 
  # Dominant ideology among the four substantive categories
  mutate(max_ideo = pmax(agr, clecon, libnat, soc),
         max_ideo_chr = case_when(agr > pmax(clecon, libnat, soc) ~ "agr",
                                  clecon > pmax(agr, libnat, soc) ~ "clecon",
                                  libnat > pmax(agr, clecon, soc) ~ "libnat",
                                  soc > pmax(agr, clecon, libnat) ~ "soc",
                                  TRUE ~ "not clear"),
         sum_ideo = agr + clecon + libnat + soc) %>% 
  # Same comparison with the "unclear" count taking part as a competitor
  mutate(max_pot_ideo = pmax(agr, clecon, libnat, soc, unclear),
         max_pot_ideo_chr = case_when(agr > pmax(clecon, libnat, soc, unclear) ~ "agr",
                                      clecon > pmax(agr, libnat, soc, unclear) ~ "clecon",
                                      libnat > pmax(agr, clecon, soc, unclear) ~ "libnat",
                                      soc > pmax(agr, clecon, libnat, unclear) ~ "soc",
                                      TRUE ~ "not clear"),
         sum_pot_ideo = agr + clecon + libnat + soc + unclear) %>% 
  mutate(cohesion = (max_ideo - (0.5 * (sum_ideo - max_ideo))) / sum_ideo,
         cohesion_wunclear = (max_pot_ideo - (0.5 * (sum_pot_ideo - max_pot_ideo))) / sum_pot_ideo,
         pg_pos = max_pot_ideo / sum_pot_ideo)

# Count members per ideology category (ideology_unclear) within each club
# (klub_no_ind_min) on every roll call and label the club's dominant ideology.
# A category only wins if it is strictly larger than every other one;
# otherwise the label is "not clear".
ideology_by_group_min <- data_full %>% 
  filter(!is.na(klub_no_ind_min)) %>% 
  count(legislature, rcv, klub_no_ind_min, ideology_unclear) %>% 
  complete(nesting(legislature, rcv, klub_no_ind_min), ideology_unclear, fill = list(n = 0)) %>% 
  pivot_wider(names_from = ideology_unclear, values_from = n) %>% 
  janitor::clean_names() %>% 
  # Dominant ideology among the four substantive categories
  mutate(max_ideo = pmax(agr, clecon, libnat, soc),
         max_ideo_chr = case_when(agr > pmax(clecon, libnat, soc) ~ "agr",
                                  clecon > pmax(agr, libnat, soc) ~ "clecon",
                                  libnat > pmax(agr, clecon, soc) ~ "libnat",
                                  soc > pmax(agr, clecon, libnat) ~ "soc",
                                  TRUE ~ "not clear"),
         sum_ideo = agr + clecon + libnat + soc) %>% 
  # Same comparison with the "unclear" count taking part as a competitor
  mutate(max_pot_ideo = pmax(agr, clecon, libnat, soc, unclear),
         max_pot_ideo_chr = case_when(agr > pmax(clecon, libnat, soc, unclear) ~ "agr",
                                      clecon > pmax(agr, libnat, soc, unclear) ~ "clecon",
                                      libnat > pmax(agr, clecon, soc, unclear) ~ "libnat",
                                      soc > pmax(agr, clecon, libnat, unclear) ~ "soc",
                                      TRUE ~ "not clear"),
         sum_pot_ideo = agr + clecon + libnat + soc + unclear) %>% 
  mutate(cohesion = (max_ideo - (0.5 * (sum_ideo - max_ideo))) / sum_ideo,
         cohesion_wunclear = (max_pot_ideo - (0.5 * (sum_pot_ideo - max_pot_ideo))) / sum_pot_ideo,
         pg_pos = max_pot_ideo / sum_pot_ideo)
```


## Calculate group and electoral party dominant nationalities

```{r}
# Count members per largest language group (largest_lang_gr) within each club
# (klub_no_ind_max) on every roll call and label the club's dominant language.
# A language only wins if its count is strictly larger than the count of each
# of the nine other languages; any tie at the top yields "not clear".
language_by_group_max <- data_full %>% 
  filter(!is.na(klub_no_ind_max)) %>% 
  count(legislature, rcv, klub_no_ind_max, largest_lang_gr) %>% 
  complete(nesting(legislature, rcv, klub_no_ind_max), largest_lang_gr, fill = list(n = 0)) %>% 
  pivot_wider(names_from = largest_lang_gr, values_from = n) %>% 
  janitor::clean_names() %>% 
  mutate(max_lang = pmax(cro, cze, ger, ita, pol, rom, rut, ser, sercro, slo),
         max_lang_chr = case_when(
           cro > pmax(cze, ger, ita, pol, rom, rut, ser, sercro, slo) ~ "cro",
           cze > pmax(cro, ger, ita, pol, rom, rut, ser, sercro, slo) ~ "cze",
           ger > pmax(cro, cze, ita, pol, rom, rut, ser, sercro, slo) ~ "ger",
           ita > pmax(cro, cze, ger, pol, rom, rut, ser, sercro, slo) ~ "ita",
           pol > pmax(cro, cze, ger, ita, rom, rut, ser, sercro, slo) ~ "pol",
           rom > pmax(cro, cze, ger, ita, pol, rut, ser, sercro, slo) ~ "rom",
           rut > pmax(cro, cze, ger, ita, pol, rom, ser, sercro, slo) ~ "rut",
           ser > pmax(cro, cze, ger, ita, pol, rom, rut, sercro, slo) ~ "ser",
           sercro > pmax(cro, cze, ger, ita, pol, rom, rut, ser, slo) ~ "sercro",
           slo > pmax(cro, cze, ger, ita, pol, rom, rut, ser, sercro) ~ "slo",
           TRUE ~ "not clear"),
         sum_lang = cro + cze + ger + ita + pol + rom + rut + ser + sercro + slo) %>% 
  mutate(cohesion = (max_lang - (0.5 * (sum_lang - max_lang))) / sum_lang,
         pg_pos = max_lang / sum_lang)

# Count members per largest language group (largest_lang_gr) within each club
# (klub_no_ind_min) on every roll call and label the club's dominant language.
# A language only wins if its count is strictly larger than the count of each
# of the nine other languages; any tie at the top yields "not clear".
language_by_group_min <- data_full %>% 
  filter(!is.na(klub_no_ind_min)) %>% 
  count(legislature, rcv, klub_no_ind_min, largest_lang_gr) %>% 
  complete(nesting(legislature, rcv, klub_no_ind_min), largest_lang_gr, fill = list(n = 0)) %>% 
  pivot_wider(names_from = largest_lang_gr, values_from = n) %>% 
  janitor::clean_names() %>% 
  mutate(max_lang = pmax(cro, cze, ger, ita, pol, rom, rut, ser, sercro, slo),
         max_lang_chr = case_when(
           cro > pmax(cze, ger, ita, pol, rom, rut, ser, sercro, slo) ~ "cro",
           cze > pmax(cro, ger, ita, pol, rom, rut, ser, sercro, slo) ~ "cze",
           ger > pmax(cro, cze, ita, pol, rom, rut, ser, sercro, slo) ~ "ger",
           ita > pmax(cro, cze, ger, pol, rom, rut, ser, sercro, slo) ~ "ita",
           pol > pmax(cro, cze, ger, ita, rom, rut, ser, sercro, slo) ~ "pol",
           rom > pmax(cro, cze, ger, ita, pol, rut, ser, sercro, slo) ~ "rom",
           rut > pmax(cro, cze, ger, ita, pol, rom, ser, sercro, slo) ~ "rut",
           ser > pmax(cro, cze, ger, ita, pol, rom, rut, sercro, slo) ~ "ser",
           sercro > pmax(cro, cze, ger, ita, pol, rom, rut, ser, slo) ~ "sercro",
           slo > pmax(cro, cze, ger, ita, pol, rom, rut, ser, sercro) ~ "slo",
           TRUE ~ "not clear"),
         sum_lang = cro + cze + ger + ita + pol + rom + rut + ser + sercro + slo) %>% 
  mutate(cohesion = (max_lang - (0.5 * (sum_lang - max_lang))) / sum_lang,
         pg_pos = max_lang / sum_lang)

# Count members per largest language group (largest_lang_gr) within each
# electoral party label (partylabel_no_ind) on every roll call and label the
# party's dominant language. A language only wins if its count is strictly
# larger than the count of each of the nine other languages; any tie at the
# top yields "not clear".
language_by_party <- data_full %>% 
  filter(!is.na(partylabel_no_ind)) %>% 
  count(legislature, rcv, partylabel_no_ind, largest_lang_gr) %>% 
  complete(nesting(legislature, rcv, partylabel_no_ind), largest_lang_gr, fill = list(n = 0)) %>% 
  pivot_wider(names_from = largest_lang_gr, values_from = n) %>% 
  janitor::clean_names() %>% 
  mutate(max_lang = pmax(cro, cze, ger, ita, pol, rom, rut, ser, sercro, slo),
         max_lang_chr = case_when(
           cro > pmax(cze, ger, ita, pol, rom, rut, ser, sercro, slo) ~ "cro",
           cze > pmax(cro, ger, ita, pol, rom, rut, ser, sercro, slo) ~ "cze",
           ger > pmax(cro, cze, ita, pol, rom, rut, ser, sercro, slo) ~ "ger",
           ita > pmax(cro, cze, ger, pol, rom, rut, ser, sercro, slo) ~ "ita",
           pol > pmax(cro, cze, ger, ita, rom, rut, ser, sercro, slo) ~ "pol",
           rom > pmax(cro, cze, ger, ita, pol, rut, ser, sercro, slo) ~ "rom",
           rut > pmax(cro, cze, ger, ita, pol, rom, ser, sercro, slo) ~ "rut",
           ser > pmax(cro, cze, ger, ita, pol, rom, rut, sercro, slo) ~ "ser",
           sercro > pmax(cro, cze, ger, ita, pol, rom, rut, ser, slo) ~ "sercro",
           slo > pmax(cro, cze, ger, ita, pol, rom, rut, ser, sercro) ~ "slo",
           TRUE ~ "not clear"),
         sum_lang = cro + cze + ger + ita + pol + rom + rut + ser + sercro + slo) %>% 
  mutate(cohesion = (max_lang - (0.5 * (sum_lang - max_lang))) / sum_lang,
         pg_pos = max_lang / sum_lang)
```


## Calculate group and electoral party dominant economic sectors

```{r}
# Count members per dominant economic sector (dom_sec) within each club
# (klub_no_ind_max) on every roll call and label the club's dominant sector.
# Members with a missing dom_sec end up in the column `na` (the "NA" column
# produced by pivot_wider(), renamed by clean_names()).
#
# The "pot" variants treat the missing category as a competitor, mirroring the
# vote (abstain_absent) and ideology (unclear) tables. `na` is therefore part
# of max_pot_sector and sum_pot_sector; without it cohesion_wunclear would
# simply repeat cohesion and pg_pos would ignore members with missing sector.
sector_by_group_max <- data_full %>% 
  filter(!is.na(klub_no_ind_max)) %>% 
  count(legislature, rcv, klub_no_ind_max, dom_sec) %>% 
  complete(nesting(legislature, rcv, klub_no_ind_max), dom_sec, fill = list(n = 0)) %>% 
  pivot_wider(names_from = dom_sec,
              values_from = n) %>% 
  janitor::clean_names() %>% 
  mutate(max_sector = pmax(agriculture, industry),
         max_sector_chr = case_when(agriculture > industry ~ "agriculture",
                                    industry > agriculture ~ "industry",
                                    TRUE ~ "not clear"),
         sum_sector = agriculture + industry,
         max_pot_sector = pmax(agriculture, industry, na),
         # A missing-sector plurality is reported as "not clear"
         max_pot_sector_chr = case_when(agriculture > industry & agriculture > na ~ "agriculture",
                                        industry > agriculture & industry > na ~ "industry",
                                        TRUE ~ "not clear"),
         sum_pot_sector = agriculture + industry + na,
         cohesion = (max_sector - (0.5 * (sum_sector - max_sector))) / sum_sector,
         cohesion_wunclear = (max_pot_sector - (0.5 * (sum_pot_sector - max_pot_sector))) / sum_pot_sector,
         pg_pos = max_pot_sector / sum_pot_sector)

# Count members per dominant economic sector (dom_sec) within each club
# (klub_no_ind_min) on every roll call and label the club's dominant sector.
# Members with a missing dom_sec end up in the column `na` (the "NA" column
# produced by pivot_wider(), renamed by clean_names()).
#
# The "pot" variants treat the missing category as a competitor, mirroring the
# vote (abstain_absent) and ideology (unclear) tables. `na` is therefore part
# of max_pot_sector and sum_pot_sector; without it cohesion_wunclear would
# simply repeat cohesion and pg_pos would ignore members with missing sector.
sector_by_group_min <- data_full %>% 
  filter(!is.na(klub_no_ind_min)) %>% 
  count(legislature, rcv, klub_no_ind_min, dom_sec) %>% 
  complete(nesting(legislature, rcv, klub_no_ind_min), dom_sec, fill = list(n = 0)) %>% 
  pivot_wider(names_from = dom_sec,
              values_from = n) %>% 
  janitor::clean_names() %>% 
  mutate(max_sector = pmax(agriculture, industry),
         max_sector_chr = case_when(agriculture > industry ~ "agriculture",
                                    industry > agriculture ~ "industry",
                                    TRUE ~ "not clear"),
         sum_sector = agriculture + industry,
         max_pot_sector = pmax(agriculture, industry, na),
         # A missing-sector plurality is reported as "not clear"
         max_pot_sector_chr = case_when(agriculture > industry & agriculture > na ~ "agriculture",
                                        industry > agriculture & industry > na ~ "industry",
                                        TRUE ~ "not clear"),
         sum_pot_sector = agriculture + industry + na,
         cohesion = (max_sector - (0.5 * (sum_sector - max_sector))) / sum_sector,
         cohesion_wunclear = (max_pot_sector - (0.5 * (sum_pot_sector - max_pot_sector))) / sum_pot_sector,
         pg_pos = max_pot_sector / sum_pot_sector)

# Count members per dominant economic sector (dom_sec) within each electoral
# party label (partylabel_no_ind) on every roll call and label the party's
# dominant sector. Members with a missing dom_sec end up in the column `na`
# (the "NA" column produced by pivot_wider(), renamed by clean_names()).
#
# The "pot" variants treat the missing category as a competitor, mirroring the
# vote (abstain_absent) and ideology (unclear) tables. `na` is therefore part
# of max_pot_sector and sum_pot_sector; without it cohesion_wunclear would
# simply repeat cohesion and pg_pos would ignore members with missing sector.
sector_by_party <- data_full %>% 
  filter(!is.na(partylabel_no_ind)) %>% 
  count(legislature, rcv, partylabel_no_ind, dom_sec) %>% 
  complete(nesting(legislature, rcv, partylabel_no_ind), dom_sec, fill = list(n = 0)) %>% 
  pivot_wider(names_from = dom_sec,
              values_from = n) %>% 
  janitor::clean_names() %>% 
  mutate(max_sector = pmax(agriculture, industry),
         max_sector_chr = case_when(agriculture > industry ~ "agriculture",
                                    industry > agriculture ~ "industry",
                                    TRUE ~ "not clear"),
         sum_sector = agriculture + industry,
         max_pot_sector = pmax(agriculture, industry, na),
         # A missing-sector plurality is reported as "not clear"
         max_pot_sector_chr = case_when(agriculture > industry & agriculture > na ~ "agriculture",
                                        industry > agriculture & industry > na ~ "industry",
                                        TRUE ~ "not clear"),
         sum_pot_sector = agriculture + industry + na,
         cohesion = (max_sector - (0.5 * (sum_sector - max_sector))) / sum_sector,
         cohesion_wunclear = (max_pot_sector - (0.5 * (sum_pot_sector - max_pot_sector))) / sum_pot_sector,
         pg_pos = max_pot_sector / sum_pot_sector)
```


## Calculate group and electoral party dominant association portfolio

```{r}
# Most frequent association profile (assoc_prof) within each club
# (klub_no_ind_max) on every roll call.
#
# Output: one row per legislature x rcv x club with
#   max_assoc_prof      size of the largest profile group
#   max_assoc_prof_chr  that profile's label, or "not clear" if several
#                       profiles share the maximum
#
# The maximum is taken directly on the long counts per group, so the result no
# longer depends on hard-coded column positions (which silently assumed a
# fixed number of distinct profiles) and needs no row-wise scan of a wide
# table. Zero-count profiles can never reach the maximum, so no complete()
# step is required.
assoc_by_group_max <- data_full %>% 
  filter(!is.na(klub_no_ind_max)) %>% 
  count(legislature, rcv, klub_no_ind_max, assoc_prof) %>% 
  group_by(legislature, rcv, klub_no_ind_max) %>% 
  summarise(max_assoc_prof = max(n),
            # Tied profiles are glued with "-" so that ties can be detected below
            max_assoc_prof_chr = paste0(assoc_prof[n == max(n)], collapse = "-"),
            .groups = "drop") %>% 
  # Labelling rule kept from the earlier implementation: any "-" in the label
  # marks a tie, and the first "x" is stripped from a unique winner's label.
  # NOTE(review): assumes profile labels themselves contain no "-" - confirm.
  mutate(max_assoc_prof_chr = ifelse(str_detect(max_assoc_prof_chr, "-"), 
                                     "not clear", 
                                     str_remove(max_assoc_prof_chr, "x")))

# Most frequent association profile (assoc_prof) within each club
# (klub_no_ind_min) on every roll call.
#
# Output: one row per legislature x rcv x club with
#   max_assoc_prof      size of the largest profile group
#   max_assoc_prof_chr  that profile's label, or "not clear" if several
#                       profiles share the maximum
#
# The maximum is taken directly on the long counts per group, so the result no
# longer depends on hard-coded column positions (which silently assumed a
# fixed number of distinct profiles) and needs no row-wise scan of a wide
# table. Zero-count profiles can never reach the maximum, so no complete()
# step is required.
assoc_by_group_min <- data_full %>% 
  filter(!is.na(klub_no_ind_min)) %>% 
  count(legislature, rcv, klub_no_ind_min, assoc_prof) %>% 
  group_by(legislature, rcv, klub_no_ind_min) %>% 
  summarise(max_assoc_prof = max(n),
            # Tied profiles are glued with "-" so that ties can be detected below
            max_assoc_prof_chr = paste0(assoc_prof[n == max(n)], collapse = "-"),
            .groups = "drop") %>% 
  # Labelling rule kept from the earlier implementation: any "-" in the label
  # marks a tie, and the first "x" is stripped from a unique winner's label.
  # NOTE(review): assumes profile labels themselves contain no "-" - confirm.
  mutate(max_assoc_prof_chr = ifelse(str_detect(max_assoc_prof_chr, "-"), 
                                     "not clear", 
                                     str_remove(max_assoc_prof_chr, "x")))

# Most frequent association profile (assoc_prof) within each electoral party
# label (partylabel_no_ind) on every roll call.
#
# Output: one row per legislature x rcv x party with
#   max_assoc_prof      size of the largest profile group
#   max_assoc_prof_chr  that profile's label, or "not clear" if several
#                       profiles share the maximum
#
# The maximum is taken directly on the long counts per group, so the result no
# longer depends on hard-coded column positions (which silently assumed a
# fixed number of distinct profiles) and needs no row-wise scan of a wide
# table. Zero-count profiles can never reach the maximum, so no complete()
# step is required.
assoc_by_party <- data_full %>% 
  filter(!is.na(partylabel_no_ind)) %>% 
  count(legislature, rcv, partylabel_no_ind, assoc_prof) %>% 
  group_by(legislature, rcv, partylabel_no_ind) %>% 
  summarise(max_assoc_prof = max(n),
            # Tied profiles are glued with "-" so that ties can be detected below
            max_assoc_prof_chr = paste0(assoc_prof[n == max(n)], collapse = "-"),
            .groups = "drop") %>% 
  # Labelling rule kept from the earlier implementation: any "-" in the label
  # marks a tie, and the first "x" is stripped from a unique winner's label.
  # NOTE(review): assumes profile labels themselves contain no "-" - confirm.
  mutate(max_assoc_prof_chr = ifelse(str_detect(max_assoc_prof_chr, "-"), 
                                     "not clear", 
                                     str_remove(max_assoc_prof_chr, "x")))
```


## Prepare dataset for deviation analysis from club vote including umbrella organizations

```{r}
# Representative x roll-call dataset for the deviation analysis relative to
# the club (klub_no_ind_max).
#
# Each dev_* indicator is 0 when the representative's value equals the club's
# dominant value on that roll call, 1 when it differs, and NA when the club
# cast two or fewer yea/nae votes (size_group <= 2) or the comparison itself
# is NA. Only yea/nae votes with a clear club majority are kept.
#
# The club-level tables hold one row per legislature x rcv x club, so all
# three columns serve as join keys. Joining on rcv and club alone would
# duplicate rows if a roll-call id were ever reused across legislatures.
#
# NOTE(review): cohesion_wabsent is joined but not kept by the final select(),
# while max_vote_share is kept without being created here - presumably it
# already exists in data_full; verify.
dev_analysis_group_max <- data_full %>% 
  left_join(vote_by_group_max %>% select(legislature, rcv, klub_no_ind_max, 
                                         max_vote_chr, sum_vote, cohesion_wabsent), 
            by = join_by(legislature, rcv, klub_no_ind_max)) %>% 
  left_join(ideology_by_group_max %>% select(legislature, rcv, klub_no_ind_max, 
                                             max_ideo_chr), 
            by = join_by(legislature, rcv, klub_no_ind_max)) %>%
  left_join(language_by_group_max %>% select(legislature, rcv, klub_no_ind_max, 
                                             max_lang_chr), 
            by = join_by(legislature, rcv, klub_no_ind_max)) %>%
  left_join(sector_by_group_max %>% select(legislature, rcv, klub_no_ind_max, 
                                           max_sector_chr), 
            by = join_by(legislature, rcv, klub_no_ind_max)) %>%
  left_join(assoc_by_group_max %>% select(legislature, rcv, klub_no_ind_max, 
                                          max_assoc_prof_chr), 
            by = join_by(legislature, rcv, klub_no_ind_max)) %>%
  rename(max_group = max_vote_chr,
         size_group = sum_vote,
         max_ideology = max_ideo_chr,
         max_language = max_lang_chr,
         max_sector = max_sector_chr,
         max_assoc_prof = max_assoc_prof_chr) %>% 
  mutate(dev_group = case_when(vote == max_group & size_group > 2 ~ 0, 
                               vote != max_group & size_group > 2 ~ 1,
                               TRUE ~ NA_real_),
         dev_ideology = case_when(ideology_unclear == max_ideology & size_group > 2 ~ 0, 
                                  ideology_unclear != max_ideology & size_group > 2 ~ 1,
                                  TRUE ~ NA_real_),
         dev_language = case_when(largest_lang_gr == max_language & size_group > 2 ~ 0, 
                                  largest_lang_gr != max_language & size_group > 2 ~ 1,
                                  TRUE ~ NA_real_),
         dev_sector = case_when(dom_sec == max_sector & size_group > 2 ~ 0, 
                                dom_sec != max_sector & size_group > 2 ~ 1,
                                TRUE ~ NA_real_),
         dev_assoc = case_when(assoc_prof == max_assoc_prof & size_group > 2 ~ 0, 
                               assoc_prof != max_assoc_prof & size_group > 2 ~ 1,
                               TRUE ~ NA_real_)) %>% 
  # filter() also drops rows where a condition is NA, e.g. members without a club
  filter(vote != "abstain_absent",
         max_group != "abstain_absent",
         max_group != "not clear",
         !is.na(dev_group)) %>% 
  select(legislature, rep_id, rcv, dev_group, dev_ideology, dev_language, 
         dev_sector, dev_assoc, max_vote_share)

save(dev_analysis_group_max, file = "dev_analysis_group_max.RData")
```


## Prepare dataset for deviation analysis from club vote excluding umbrella organizations

```{r}
# Representative x roll-call dataset for the deviation analysis relative to
# the club (klub_no_ind_min).
#
# Each dev_* indicator is 0 when the representative's value equals the club's
# dominant value on that roll call, 1 when it differs, and NA when the club
# cast two or fewer yea/nae votes (size_group <= 2) or the comparison itself
# is NA. Only yea/nae votes with a clear club majority are kept.
#
# The club-level tables hold one row per legislature x rcv x club, so all
# three columns serve as join keys. Joining on rcv and club alone would
# duplicate rows if a roll-call id were ever reused across legislatures.
#
# NOTE(review): cohesion_wabsent is joined but not kept by the final select(),
# while max_vote_share is kept without being created here - presumably it
# already exists in data_full; verify.
dev_analysis_group_min <- data_full %>% 
  left_join(vote_by_group_min %>% select(legislature, rcv, klub_no_ind_min, 
                                         max_vote_chr, sum_vote, cohesion_wabsent), 
            by = join_by(legislature, rcv, klub_no_ind_min)) %>% 
  left_join(ideology_by_group_min %>% select(legislature, rcv, klub_no_ind_min, 
                                             max_ideo_chr), 
            by = join_by(legislature, rcv, klub_no_ind_min)) %>%
  left_join(language_by_group_min %>% select(legislature, rcv, klub_no_ind_min, 
                                             max_lang_chr), 
            by = join_by(legislature, rcv, klub_no_ind_min)) %>%
  left_join(sector_by_group_min %>% select(legislature, rcv, klub_no_ind_min, 
                                           max_sector_chr), 
            by = join_by(legislature, rcv, klub_no_ind_min)) %>%
  left_join(assoc_by_group_min %>% select(legislature, rcv, klub_no_ind_min, 
                                          max_assoc_prof_chr), 
            by = join_by(legislature, rcv, klub_no_ind_min)) %>%
  rename(max_group = max_vote_chr,
         size_group = sum_vote,
         max_ideology = max_ideo_chr,
         max_language = max_lang_chr,
         max_sector = max_sector_chr,
         max_assoc_prof = max_assoc_prof_chr) %>% 
  mutate(dev_group = case_when(vote == max_group & size_group > 2 ~ 0, 
                               vote != max_group & size_group > 2 ~ 1,
                               TRUE ~ NA_real_),
         dev_ideology = case_when(ideology_unclear == max_ideology & size_group > 2 ~ 0, 
                                  ideology_unclear != max_ideology & size_group > 2 ~ 1,
                                  TRUE ~ NA_real_),
         dev_language = case_when(largest_lang_gr == max_language & size_group > 2 ~ 0, 
                                  largest_lang_gr != max_language & size_group > 2 ~ 1,
                                  TRUE ~ NA_real_),
         dev_sector = case_when(dom_sec == max_sector & size_group > 2 ~ 0, 
                                dom_sec != max_sector & size_group > 2 ~ 1,
                                TRUE ~ NA_real_),
         dev_assoc = case_when(assoc_prof == max_assoc_prof & size_group > 2 ~ 0, 
                               assoc_prof != max_assoc_prof & size_group > 2 ~ 1,
                               TRUE ~ NA_real_)) %>% 
  # filter() also drops rows where a condition is NA, e.g. members without a club
  filter(vote != "abstain_absent",
         max_group != "abstain_absent",
         max_group != "not clear",
         !is.na(dev_group)) %>% 
  select(legislature, rep_id, rcv, dev_group, dev_ideology, dev_language, 
         dev_sector, dev_assoc, max_vote_share)

save(dev_analysis_group_min, file = "dev_analysis_group_min.RData")
```


## Prepare dataset for deviation analysis from electoral party vote

```{r}
# Representative x roll-call dataset for the deviation analysis relative to
# the electoral party label (partylabel_no_ind).
#
# Each dev_* indicator is 0 when the representative's value equals the party's
# dominant value on that roll call, 1 when it differs, and NA when the party
# cast two or fewer yea/nae votes (size_party <= 2) or the comparison itself
# is NA. Only yea/nae votes with a clear party majority are kept.
#
# The party-level tables hold one row per legislature x rcv x party, so all
# three columns serve as join keys. Joining on rcv and party alone would
# duplicate rows if a roll-call id were ever reused across legislatures.
#
# NOTE(review): cohesion_wabsent is joined but not kept by the final select(),
# while max_vote_share is kept without being created here - presumably it
# already exists in data_full; verify.
dev_analysis_party <- data_full %>% 
  left_join(vote_by_party %>% select(legislature, rcv, partylabel_no_ind, 
                                     max_vote_chr, sum_vote, cohesion_wabsent), 
            by = join_by(legislature, rcv, partylabel_no_ind)) %>% 
  left_join(language_by_party %>% select(legislature, rcv, partylabel_no_ind, 
                                         max_lang_chr), 
            by = join_by(legislature, rcv, partylabel_no_ind)) %>%
  left_join(sector_by_party %>% select(legislature, rcv, partylabel_no_ind, 
                                       max_sector_chr), 
            by = join_by(legislature, rcv, partylabel_no_ind)) %>%
  left_join(assoc_by_party %>% select(legislature, rcv, partylabel_no_ind, 
                                      max_assoc_prof_chr), 
            by = join_by(legislature, rcv, partylabel_no_ind)) %>%
  rename(max_party = max_vote_chr,
         size_party = sum_vote,
         max_language = max_lang_chr,
         max_sector = max_sector_chr,
         max_assoc_prof = max_assoc_prof_chr) %>% 
  mutate(dev_party = case_when(vote == max_party & size_party > 2 ~ 0, 
                               vote != max_party & size_party > 2 ~ 1,
                               TRUE ~ NA_real_),
         dev_language = case_when(largest_lang_gr == max_language & size_party > 2 ~ 0, 
                                  largest_lang_gr != max_language & size_party > 2 ~ 1,
                                  TRUE ~ NA_real_),
         dev_sector = case_when(dom_sec == max_sector & size_party > 2 ~ 0, 
                                dom_sec != max_sector & size_party > 2 ~ 1,
                                TRUE ~ NA_real_),
         dev_assoc = case_when(assoc_prof == max_assoc_prof & size_party > 2 ~ 0, 
                               assoc_prof != max_assoc_prof & size_party > 2 ~ 1,
                               TRUE ~ NA_real_)) %>% 
  # filter() also drops rows where a condition is NA, e.g. members without a party label
  filter(vote != "abstain_absent",
         max_party != "abstain_absent",
         max_party != "not clear",
         !is.na(dev_party)) %>% 
  select(legislature, rep_id, rcv, dev_party,  dev_language, dev_sector, 
         dev_assoc, max_vote_share)

save(dev_analysis_party, file = "dev_analysis_party.RData")
```

